library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(modelr)
library(viridis)
## Loading required package: viridisLite
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
library(latex2exp)
library(ggridges)
## Warning: package 'ggridges' was built under R version 4.2.3
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(lmerTest)
##
## Attaching package: 'lmerTest'
##
## The following object is masked from 'package:lme4':
##
## lmer
##
## The following object is masked from 'package:stats':
##
## step
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(broom.mixed)
## Warning: package 'broom.mixed' was built under R version 4.2.3
library(jtools)
dirname <- "/Users/nathanielimel/uci/projects/citesim/src/notebooks/"
get_data_v <- function(vectorizer) {
fp <- paste(dirname, vectorizer, "_main_data.csv", sep="")
v_data <- read_csv(fp)
v_data$vectorizer <- vectorizer
return(v_data)
}
df_all <- rbind(
get_data_v("SciBERT"),
get_data_v("SBERT"),
get_data_v("GPT2"),
get_data_v("Word2Vec"),
get_data_v("BOW")
)
## Rows: 2375 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): field
## dbl (13): density_bin, freq, count, log_cpy entropy, log_cpy variance, log_c...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2302 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): field
## dbl (13): density_bin, freq, count, log_cpy entropy, log_cpy variance, log_c...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4112 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): field
## dbl (13): density_bin, freq, count, log_cpy entropy, log_cpy variance, log_c...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2790 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): field
## dbl (13): density_bin, freq, count, log_cpy entropy, log_cpy variance, log_c...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2649 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): field
## dbl (13): density_bin, freq, count, log_cpy entropy, log_cpy variance, log_c...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# df_all <- read_csv("/Users/nathanielimel/uci/projects/citesim/src/notebooks/SBERT_SciBERT_GPT2_main_data.csv")
head(df_all)
## # A tibble: 6 × 15
## densit…¹ freq count log_c…² log_c…³ log_c…⁴ cpy_var cpy_med cpy_m…⁵ ref_med
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 29.8 1.08e-3 12 NA 0.434 0.660 89.9 2.37 7.00 69.5
## 2 29.9 9.86e-4 11 NA 0.578 0.827 526. 6 14.2 42
## 3 30.4 1.08e-3 12 NA 0.440 0.802 1346. 6.06 17.4 60
## 4 30.5 1.08e-3 12 NA 0.383 0.420 28.8 2.28 4.48 46.5
## 5 30.7 1.34e-3 15 NA 0.361 0.684 33.7 1.73 5.38 53
## 6 30.9 1.08e-3 12 NA 0.652 0.389 1710. 1.68 19.8 59.5
## # … with 5 more variables: ref_var <dbl>, year_med <dbl>, field <chr>,
## # log_cpy_var <dbl>, vectorizer <chr>, and abbreviated variable names
## # ¹density_bin, ²`log_cpy entropy`, ³`log_cpy variance`, ⁴`log_cpy median`,
## # ⁵cpy_mean
get_anova <- function(y, vectorizer_name) {
df_vectorizer <- df_all %>% filter(
vectorizer == vectorizer_name
) %>% mutate(
# z-scale
y_z = scale(.data[[y]]),
density_bin_z = scale(density_bin),
year_med_z = scale(year_med)
)
# Model 4: age, density, and random density slopes and intercepts
label_4_both <- paste(
# y,
# " ~ year_med + density_bin + (1 + density_bin | field)"
"y_z ~ year_med_z + density_bin_z + (1 + density_bin_z | field)"
)
fm_4_both <- as.formula(label_4_both)
model_4_both <- lmer(
fm_4_both,
data=df_vectorizer,
control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
)
# Model 3: density and random intercepts
label_3_rho <- paste(
# y,
# "~ density_bin + (1 | field)"
"y_z ~ density_bin_z + (1 | field)"
)
fm_3_rho <- as.formula(label_3_rho)
model_3_rho <- lmer(
fm_3_rho,
data=df_vectorizer,
control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
)
# Model 2: age and random intercepts
label_2_age <- paste(
# y,
# " ~ year_med + (1 | field)"
"y_z ~ year_med_z + (1 | field)"
)
fm_2_age <- as.formula(label_2_age)
model_2_age <- lmer(fm_2_age, data=df_vectorizer)
# Model 1: intercept only (mean)
label_1_int <- paste(
# y,
# " ~ 1"
"y_z ~ 1"
)
fm_1_int <- as.formula(label_1_int)
model_1_int <- lm(fm_1_int, data=df_vectorizer)
results <- anova(
model_4_both,
model_3_rho,
model_2_age,
model_1_int
)
results_df <- tidy(results)
results_model_4_both <- results_df %>% filter(
term == "model_4_both",
)
results_annotated <- results_df %>% mutate(
bic_gain = results$BIC - results_model_4_both$BIC,
vectorizer = vectorizer_name,
predictor = y,
# Always check that the results are actually ordered in this way...should use a dict but oh well
# fms = c(label_4_both, label_3_rho, label_2_age, label_1_int),
# fm = c(label_1_int, label_2_age, label_3_rho, label_4_both),
)
print(summary(model_4_both))
print("------------------------------")
# print(results)
return(results_annotated)
}
# test the function
# get_anova("log_cpy_var", "SBERT")
# get_anova("cpy_med", "SBERT")
# get_anova("log_cpy_var", "SciBERT")
# get_anova("cpy_med", "SciBERT")
# get_anova("log_cpy_var", "GPT2")
# get_anova("log_cpy_var", "Word2Vec")
get_anova("log_cpy_var", "BOW")
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 6432.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8108 -0.6673 -0.1173 0.5447 7.0614
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.2393 0.4892
## density_bin_z 0.0608 0.2466 -0.42
## Residual 0.6489 0.8056
## Number of obs: 2649, groups: field, 8
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.03158 0.17588 7.07875 -0.180 0.8625
## year_med_z 0.01213 0.02194 1778.76178 0.553 0.5802
## density_bin_z -0.24505 0.09675 6.35506 -2.533 0.0424 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) yr_md_
## year_med_z -0.051
## densty_bn_z -0.321 -0.172
## [1] "------------------------------"
## # A tibble: 4 × 12
## term npar AIC BIC logLik devia…¹ stati…² df p.value bic_g…³
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 model_1_int 2 7521. 7532. -3758. 7517. NA NA NA 1055.
## 2 model_3_rho 4 6455. 6479. -3224. 6447. 1069. 2 6.55e-233 1.62
## 3 model_2_age 4 6496. 6520. -3244. 6488. 0 0 NA 42.6
## 4 model_4_both 7 6436. 6477. -3211. 6422. 66.3 3 2.67e- 14 0
## # … with 2 more variables: vectorizer <chr>, predictor <chr>, and abbreviated
## # variable names ¹deviance, ²statistic, ³bic_gain
df_plot <- rbind(
get_anova("log_cpy_var", "SciBERT"),
get_anova("log_cpy_var", "SBERT"),
get_anova("cpy_med", "SciBERT"),
get_anova("cpy_med", "SBERT")
) %>% filter(
term != "model_4_both",
term != "model_1_int",
)
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 5254.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8654 -0.5932 -0.0719 0.4756 7.9005
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.19106 0.4371
## density_bin_z 0.05907 0.2431 0.60
## Residual 0.51870 0.7202
## Number of obs: 2375, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.02079 0.14695 8.05646 -0.142 0.89095
## year_med_z 0.03674 0.02344 2328.71793 1.567 0.11718
## density_bin_z -0.46196 0.08412 8.66008 -5.492 0.00044 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) yr_md_
## year_med_z -0.024
## densty_bn_z 0.582 -0.174
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 5561.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.6399 -0.6302 -0.0828 0.4669 9.1671
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.20129 0.4487
## density_bin_z 0.01563 0.1250 0.10
## Residual 0.63986 0.7999
## Number of obs: 2302, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 5.055e-02 1.530e-01 8.206e+00 0.330 0.749307
## year_med_z -4.007e-03 2.690e-02 1.699e+03 -0.149 0.881613
## density_bin_z -3.171e-01 5.514e-02 1.010e+01 -5.752 0.000178 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) yr_md_
## year_med_z -0.055
## densty_bn_z 0.133 -0.421
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 3697.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.3665 -0.4812 -0.1186 0.3347 11.4436
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.60811 0.7798
## density_bin_z 0.06356 0.2521 0.68
## Residual 0.26697 0.5167
## Number of obs: 2375, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -1.253e-01 2.603e-01 8.005e+00 -0.481 0.643
## year_med_z 9.221e-03 1.691e-02 2.370e+03 0.545 0.586
## density_bin_z 2.359e-01 8.561e-02 8.299e+00 2.755 0.024 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) yr_md_
## year_med_z -0.010
## densty_bn_z 0.667 -0.124
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 3761.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7080 -0.4459 -0.0888 0.2831 11.3816
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.59315 0.7702
## density_bin_z 0.03085 0.1756 0.68
## Residual 0.28993 0.5385
## Number of obs: 2302, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.24693 0.25776 8.05023 -0.958 0.3659
## year_med_z 0.01639 0.01840 2277.08893 0.891 0.3731
## density_bin_z 0.12776 0.06389 9.62107 2.000 0.0745 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) yr_md_
## year_med_z -0.024
## densty_bn_z 0.642 -0.255
## [1] "------------------------------"
(
ggplot(
df_plot,
aes(
x=bic_gain,
y=term,
fill=vectorizer,
)
)
+ geom_bar( stat = "identity", position = "dodge")
# + xlim(-4000, -1000)
+ theme_classic()
# + scale_y_discrete(position = "right")
# + scale_fill_manual( values = c("lightblue", "darkblue"))
# + scale_fill_brewer()
# + facet_grid("vectorizer ~ predictor")
)

# it's hard to visualize via faceting because it makes significant differences in BIC look insignificant. So let's just try printing 4 separate plots with diff scales?
plot_bic <- function(df_plot) {
df_plot <- df_plot %>% filter(
# term != "model_4_both",
term != "model_1_int",
)
plot <- (
ggplot(
df_plot,
aes(
x=bic_gain,
y=term,
)
)
+ geom_bar( stat = "identity", position = "dodge")
+ theme_classic()
)
return(plot)
}
for (predictor in c("cpy_med", "log_cpy_var")){
for (vectorizer in c("SciBERT", "SBERT")){
print(plot_bic(
get_anova(predictor, vectorizer)
))
}
}
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 3697.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.3665 -0.4812 -0.1186 0.3347 11.4436
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.60811 0.7798
## density_bin_z 0.06356 0.2521 0.68
## Residual 0.26697 0.5167
## Number of obs: 2375, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -1.253e-01 2.603e-01 8.005e+00 -0.481 0.643
## year_med_z 9.221e-03 1.691e-02 2.370e+03 0.545 0.586
## density_bin_z 2.359e-01 8.561e-02 8.299e+00 2.755 0.024 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) yr_md_
## year_med_z -0.010
## densty_bn_z 0.667 -0.124
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 3761.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7080 -0.4459 -0.0888 0.2831 11.3816
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.59315 0.7702
## density_bin_z 0.03085 0.1756 0.68
## Residual 0.28993 0.5385
## Number of obs: 2302, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.24693 0.25776 8.05023 -0.958 0.3659
## year_med_z 0.01639 0.01840 2277.08893 0.891 0.3731
## density_bin_z 0.12776 0.06389 9.62107 2.000 0.0745 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) yr_md_
## year_med_z -0.024
## densty_bn_z 0.642 -0.255
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 5254.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8654 -0.5932 -0.0719 0.4756 7.9005
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.19106 0.4371
## density_bin_z 0.05907 0.2431 0.60
## Residual 0.51870 0.7202
## Number of obs: 2375, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.02079 0.14695 8.05646 -0.142 0.89095
## year_med_z 0.03674 0.02344 2328.71793 1.567 0.11718
## density_bin_z -0.46196 0.08412 8.66008 -5.492 0.00044 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) yr_md_
## year_med_z -0.024
## densty_bn_z 0.582 -0.174
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 5561.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.6399 -0.6302 -0.0828 0.4669 9.1671
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.20129 0.4487
## density_bin_z 0.01563 0.1250 0.10
## Residual 0.63986 0.7999
## Number of obs: 2302, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 5.055e-02 1.530e-01 8.206e+00 0.330 0.749307
## year_med_z -4.007e-03 2.690e-02 1.699e+03 -0.149 0.881613
## density_bin_z -3.171e-01 5.514e-02 1.010e+01 -5.752 0.000178 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) yr_md_
## year_med_z -0.055
## densty_bn_z 0.133 -0.421
## [1] "------------------------------"

get_summary <- function(y, vectorizer_name) {
df_vectorizer <- df_all %>% filter(
vectorizer == vectorizer_name
) %>% mutate(
# z-scale
y_z = scale(.data[[y]]),
density_bin_z = scale(density_bin),
year_med_z = scale(year_med)
)
# Model 4: age, density, and random density slopes and intercepts
label_4_both <- paste(
# y,
# " ~ year_med + density_bin + (1 + density_bin | field)"
"y_z ~ year_med_z + density_bin_z + (1 + density_bin_z | field)"
)
fm_4_both <- as.formula(label_4_both)
model_4_both <- lmer(
fm_4_both,
data=df_vectorizer,
control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
)
print(summary(model_4_both))
return(model_4_both)
}
tidy(get_summary("log_cpy_var", "SciBERT"))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 5254.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8654 -0.5932 -0.0719 0.4756 7.9005
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.19106 0.4371
## density_bin_z 0.05907 0.2431 0.60
## Residual 0.51870 0.7202
## Number of obs: 2375, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.02079 0.14695 8.05646 -0.142 0.89095
## year_med_z 0.03674 0.02344 2328.71793 1.567 0.11718
## density_bin_z -0.46196 0.08412 8.66008 -5.492 0.00044 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) yr_md_
## year_med_z -0.024
## densty_bn_z 0.582 -0.174
## # A tibble: 7 × 8
## effect group term estim…¹ std.e…² stati…³ df p.value
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 fixed <NA> (Intercept) -0.0208 0.147 -0.142 8.06 8.91e-1
## 2 fixed <NA> year_med_z 0.0367 0.0234 1.57 2329. 1.17e-1
## 3 fixed <NA> density_bin_z -0.462 0.0841 -5.49 8.66 4.40e-4
## 4 ran_pars field sd__(Intercept) 0.437 NA NA NA NA
## 5 ran_pars field cor__(Intercept).d… 0.602 NA NA NA NA
## 6 ran_pars field sd__density_bin_z 0.243 NA NA NA NA
## 7 ran_pars Residual sd__Observation 0.720 NA NA NA NA
## # … with abbreviated variable names ¹estimate, ²std.error, ³statistic
getrefs <- function(y, vectorizer_name, print_summary=TRUE) {
df_vectorizer <- df_all %>% filter(
vectorizer == vectorizer_name
) %>% mutate(
# z-scale
y_z = scale(.data[[y]]),
density_bin_z = scale(density_bin),
year_med_z = scale(year_med),
ref_med_z = scale(ref_med),
freq_z = scale(freq),
) %>% filter(
y_z < 3,
y_z > -3,
density_bin_z < 3,
density_bin_z > -3,
ref_med_z < 3,
ref_med_z > -3,
freq_z < 3,
freq_z > -3
)
# Model 5:
label_5 <- paste(
# "y_z ~ year_med_z + density_bin_z + (1 | field)"
# "y_z ~ ref_med_z + year_med_z + density_bin_z + (1 | field)"
# "y_z ~ ref_med_z + year_med_z + density_bin_z + (1 + density_bin_z + ref_med_z | field)"
"y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 + density_bin_z | field)"
# "y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 | field)"
)
fm_5 <- as.formula(label_5)
model_5 <- lmer(
fm_5,
data=df_vectorizer,
control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
)
if (print_summary) {
print(summary(model_5))
}
# model_ref_rho <- lmer(
# ref_med_z ~ density_bin_z + (1 + density_bin_z | field),
# data=df_vectorizer,
# control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
# )
#
# print(summary(model_ref_rho))
return(model_5)
}
# summary(getrefs("log_cpy_var", "GPT2", FALSE))
getrefs("log_cpy_var", "SciBERT", FALSE)
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
## Data: df_vectorizer
## REML criterion at convergence: 4653.778
## Random effects:
## Groups Name Std.Dev. Corr
## field (Intercept) 0.4310
## density_bin_z 0.2200 0.63
## Residual 0.6541
## Number of obs: 2299, groups: field, 9
## Fixed Effects:
## (Intercept) ref_med_z year_med_z freq_z density_bin_z
## -0.03233 -0.03303 0.03197 -0.02746 -0.40171
# sigs_dfs <- tibble()
#
# for (vec in unique(df_all$vectorizer)) {
# sigs_df <- tidy(getrefs(y, vec, FALSE)) %>% filter(
# effect == "fixed",
# term != "(Intercept)"
# ) %>% select(
# term,
# estimate,
# p.value,
# ) %>% mutate(
# model = vec
# )
# sigs_dfs <- rbind(sigs_dfs, sigs_df)
#
# }
#
# sigs_dfs <- sigs_dfs %>% mutate(
# significance = `p.value` < 0.05
# )
# sigs_dfs
for (y in c("cpy_med", "log_cpy_var")) {
print(
plot_summs(
getrefs(y, "SciBERT",),
getrefs(y, "SBERT"),
getrefs(y, "GPT2"),
getrefs(y, "Word2Vec"),
getrefs(y, "BOW"),
model.names = c("SciBERT", "SBERT", "GPT2", "Word2Vec", "BOW"),
# model.names = c("SciBERT", "SBERT", "GPT2"),
colors = "Rainbow",
point.shape = FALSE,
robust=TRUE,
inner_ci_level = .9
# plot.distributions = TRUE
)
# + scale_color_viridis(discrete=TRUE)
# + scale_color_manual( values = c("blue", "red"))
+ theme(
axis.text.x = element_text(size=20)
)
# Add shapes for significance
)
}
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 2674.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.6905 -0.5340 -0.1130 0.4452 4.6943
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.44155 0.6645
## density_bin_z 0.05775 0.2403 0.54
## Residual 0.17929 0.4234
## Number of obs: 2293, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -1.079e-01 2.220e-01 7.989e+00 -0.486 0.6399
## ref_med_z 1.512e-01 1.829e-02 2.288e+03 8.266 2.32e-16 ***
## year_med_z -7.479e-03 1.436e-02 2.286e+03 -0.521 0.6025
## freq_z 1.117e-02 1.888e-02 2.280e+03 0.592 0.5542
## density_bin_z 1.950e-01 8.142e-02 8.292e+00 2.394 0.0425 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.024
## year_med_z -0.017 -0.173
## freq_z -0.015 0.005 0.105
## densty_bn_z 0.530 -0.034 -0.103 -0.038
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 2701.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9371 -0.5057 -0.0747 0.3894 5.9759
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.44247 0.6652
## density_bin_z 0.01275 0.1129 0.60
## Residual 0.18860 0.4343
## Number of obs: 2236, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -2.219e-01 2.230e-01 8.073e+00 -0.995 0.349
## ref_med_z 1.435e-01 2.086e-02 1.718e+03 6.878 8.46e-12 ***
## year_med_z 7.255e-03 1.528e-02 2.197e+03 0.475 0.635
## freq_z -6.840e-02 4.327e-02 2.226e+03 -1.581 0.114
## density_bin_z 5.579e-02 4.346e-02 1.057e+01 1.284 0.227
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.030
## year_med_z -0.028 -0.172
## freq_z -0.011 0.010 -0.003
## densty_bn_z 0.535 -0.143 -0.271 -0.002
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 5886
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9412 -0.5544 -0.1130 0.4424 5.8618
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.49398 0.7028
## density_bin_z 0.01357 0.1165 -0.33
## Residual 0.24985 0.4999
## Number of obs: 3981, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 2.354e-02 2.344e-01 7.984e+00 0.100 0.9225
## ref_med_z 1.445e-01 1.396e-02 3.973e+03 10.346 <2e-16 ***
## year_med_z 2.255e-02 1.039e-02 3.972e+03 2.170 0.0300 *
## freq_z 1.159e-02 9.363e-03 3.968e+03 1.237 0.2160
## density_bin_z 9.694e-02 4.007e-02 8.230e+00 2.419 0.0411 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z -0.001
## year_med_z 0.001 -0.341
## freq_z 0.001 -0.010 -0.041
## densty_bn_z -0.318 0.033 -0.054 -0.088
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 3704.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9179 -0.5408 -0.0782 0.3886 5.4505
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.50971 0.7139
## density_bin_z 0.01459 0.1208 0.38
## Residual 0.22268 0.4719
## Number of obs: 2697, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.15774 0.23824 7.96916 -0.662 0.5266
## ref_med_z 0.11736 0.02164 2382.66994 5.423 6.47e-08 ***
## year_med_z 0.01173 0.01630 2551.20271 0.720 0.4718
## freq_z -0.01933 0.01129 2682.29647 -1.713 0.0868 .
## density_bin_z -0.03373 0.04454 10.16069 -0.757 0.4660
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.010
## year_med_z -0.001 -0.146
## freq_z -0.004 0.018 -0.053
## densty_bn_z 0.339 -0.210 -0.245 -0.004
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 3905.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.5459 -0.5489 -0.0859 0.4321 5.8348
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.5847 0.7646
## density_bin_z 0.0170 0.1304 0.70
## Residual 0.2675 0.5172
## Number of obs: 2519, groups: field, 8
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.21031 0.27111 6.95790 -0.776 0.46346
## ref_med_z 0.12214 0.02114 2291.08060 5.777 8.63e-09 ***
## year_med_z 0.01549 0.01459 1152.08085 1.062 0.28867
## freq_z -0.03822 0.01996 2469.16024 -1.915 0.05558 .
## density_bin_z -0.19559 0.05392 8.53926 -3.627 0.00602 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.017
## year_med_z -0.020 -0.194
## freq_z 0.005 -0.068 -0.089
## densty_bn_z 0.619 -0.096 -0.176 0.082

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 4653.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8335 -0.6259 -0.0553 0.5520 5.1991
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.18578 0.4310
## density_bin_z 0.04841 0.2200 0.63
## Residual 0.42789 0.6541
## Number of obs: 2299, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.03233 0.14558 7.89619 -0.222 0.829872
## ref_med_z -0.03303 0.02801 2016.02733 -1.179 0.238497
## year_med_z 0.03197 0.02208 2218.18905 1.448 0.147875
## freq_z -0.02746 0.02899 2194.71086 -0.947 0.343731
## density_bin_z -0.40171 0.07671 8.89641 -5.237 0.000558 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.053
## year_med_z -0.038 -0.168
## freq_z -0.032 0.019 0.092
## densty_bn_z 0.599 -0.058 -0.169 -0.056
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 4933.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8750 -0.6754 -0.0699 0.5681 4.2046
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.205341 0.4531
## density_bin_z 0.007956 0.0892 0.24
## Residual 0.515845 0.7182
## Number of obs: 2239, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.05704 0.15533 8.12097 0.367 0.722830
## ref_med_z 0.04669 0.03338 576.65815 1.398 0.162519
## year_med_z -0.01986 0.02471 1550.82590 -0.804 0.421697
## freq_z 0.03613 0.07152 2025.50493 0.505 0.613520
## density_bin_z -0.26608 0.04531 10.88059 -5.873 0.000112 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.062
## year_med_z -0.057 -0.166
## freq_z -0.018 0.012 -0.014
## densty_bn_z 0.195 -0.231 -0.414 0.029
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 9401.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9047 -0.6735 -0.1070 0.5424 3.9004
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.15236 0.3903
## density_bin_z 0.01323 0.1150 -0.18
## Residual 0.61230 0.7825
## Number of obs: 3970, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.10504 0.13083 7.92441 -0.803 0.44545
## ref_med_z -0.05524 0.02181 3558.29844 -2.533 0.01136 *
## year_med_z -0.08828 0.01618 3879.27605 -5.456 5.18e-08 ***
## freq_z 0.04531 0.01460 3959.59851 3.104 0.00192 **
## density_bin_z -0.07805 0.04129 8.64592 -1.890 0.09261 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z -0.002
## year_med_z 0.003 -0.346
## freq_z 0.003 -0.009 -0.043
## densty_bn_z -0.172 0.051 -0.078 -0.131
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 6221.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.6222 -0.6627 -0.1041 0.5764 4.0578
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.2191 0.4680
## density_bin_z 0.0429 0.2071 0.02
## Residual 0.5702 0.7551
## Number of obs: 2696, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.06713 0.15705 7.94359 0.427 0.6804
## ref_med_z -0.02488 0.03458 2338.14436 -0.719 0.4719
## year_med_z -0.04334 0.02598 2480.56694 -1.668 0.0954 .
## freq_z 0.02210 0.01815 2685.82851 1.218 0.2234
## density_bin_z -0.02122 0.07546 9.74491 -0.281 0.7844
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.024
## year_med_z -0.003 -0.151
## freq_z -0.010 0.022 -0.048
## densty_bn_z 0.015 -0.198 -0.229 -0.006
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 5872
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9663 -0.6909 -0.0868 0.5984 4.3496
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.23887 0.4887
## density_bin_z 0.05737 0.2395 -0.40
## Residual 0.58292 0.7635
## Number of obs: 2525, groups: field, 8
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -4.924e-02 1.758e-01 6.885e+00 -0.280 0.7876
## ref_med_z 5.566e-03 3.088e-02 1.754e+03 0.180 0.8570
## year_med_z 2.747e-02 2.177e-02 1.595e+03 1.262 0.2072
## freq_z 7.067e-03 2.951e-02 2.482e+03 0.239 0.8108
## density_bin_z -2.765e-01 9.469e-02 6.404e+00 -2.920 0.0247 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.038
## year_med_z -0.055 -0.201
## freq_z 0.015 -0.060 -0.095
## densty_bn_z -0.307 -0.080 -0.163 0.075

# Looks like number of references is a better predictor of the typical citation rates than density.
# However, we have very good reason to believe that density causes number of references, not the other way around!
# Create a new df grouped by vectorizer and field (should have done this above, refactor)
df_grouped <- df_all %>%
group_by(field, vectorizer) %>%
mutate_at(vars(cpy_med, density_bin, year_med, ref_med),
list(z = ~scale(.))) %>%
filter(cpy_med_z < 3,
cpy_med_z > -3,
density_bin_z < 3,
density_bin_z > -3,
ref_med_z < 3,
ref_med_z > -3)
# head(df_grouped)
for (vec in unique(df_grouped$vectorizer)) {
df_vec <- df_grouped %>% filter(
vectorizer == vec
)
plot <- (
ggplot(
df_vec,
aes(
# x=density_bin_z,
# y=ref_med_z,
x=density_bin,
y=ref_med,
# color=field,
)
)
+ geom_smooth(
method="lm",
linewidth=3,
color="black",
)
+ geom_point(
aes(
color=field,
),
size=0.5,
alpha=0.2,
)
+ geom_smooth(
aes(
color=field,
),
# method="lm",
linewidth=1,
)
# + facet_grid("~ vectorizer")
# + facet_wrap("vectorizer")
+ theme_classic()
+ ggtitle(vec)
+ theme(
axis.text = element_text(size=20)
)
)
print(plot)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# ggsave(
# "refs_by_vectorizers.png",
# width=12,
# height=3,
# )
getrefs("cpy_med", "SciBERT", TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 2674.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.6905 -0.5340 -0.1130 0.4452 4.6943
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.44155 0.6645
## density_bin_z 0.05775 0.2403 0.54
## Residual 0.17929 0.4234
## Number of obs: 2293, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -1.079e-01 2.220e-01 7.989e+00 -0.486 0.6399
## ref_med_z 1.512e-01 1.829e-02 2.288e+03 8.266 2.32e-16 ***
## year_med_z -7.479e-03 1.436e-02 2.286e+03 -0.521 0.6025
## freq_z 1.117e-02 1.888e-02 2.280e+03 0.592 0.5542
## density_bin_z 1.950e-01 8.142e-02 8.292e+00 2.394 0.0425 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.024
## year_med_z -0.017 -0.173
## freq_z -0.015 0.005 0.105
## densty_bn_z 0.530 -0.034 -0.103 -0.038
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
## Data: df_vectorizer
## REML criterion at convergence: 2674.497
## Random effects:
## Groups Name Std.Dev. Corr
## field (Intercept) 0.6645
## density_bin_z 0.2403 0.54
## Residual 0.4234
## Number of obs: 2293, groups: field, 9
## Fixed Effects:
## (Intercept) ref_med_z year_med_z freq_z density_bin_z
## -0.107935 0.151210 -0.007479 0.011169 0.194950
getrefs("cpy_med", "SBERT", TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 2701.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9371 -0.5057 -0.0747 0.3894 5.9759
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.44247 0.6652
## density_bin_z 0.01275 0.1129 0.60
## Residual 0.18860 0.4343
## Number of obs: 2236, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -2.219e-01 2.230e-01 8.073e+00 -0.995 0.349
## ref_med_z 1.435e-01 2.086e-02 1.718e+03 6.878 8.46e-12 ***
## year_med_z 7.255e-03 1.528e-02 2.197e+03 0.475 0.635
## freq_z -6.840e-02 4.327e-02 2.226e+03 -1.581 0.114
## density_bin_z 5.579e-02 4.346e-02 1.057e+01 1.284 0.227
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.030
## year_med_z -0.028 -0.172
## freq_z -0.011 0.010 -0.003
## densty_bn_z 0.535 -0.143 -0.271 -0.002
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
## Data: df_vectorizer
## REML criterion at convergence: 2701.799
## Random effects:
## Groups Name Std.Dev. Corr
## field (Intercept) 0.6652
## density_bin_z 0.1129 0.60
## Residual 0.4343
## Number of obs: 2236, groups: field, 9
## Fixed Effects:
## (Intercept) ref_med_z year_med_z freq_z density_bin_z
## -0.221861 0.143473 0.007255 -0.068404 0.055791
getrefs("cpy_med", "GPT2", TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 5886
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9412 -0.5544 -0.1130 0.4424 5.8618
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.49398 0.7028
## density_bin_z 0.01357 0.1165 -0.33
## Residual 0.24985 0.4999
## Number of obs: 3981, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 2.354e-02 2.344e-01 7.984e+00 0.100 0.9225
## ref_med_z 1.445e-01 1.396e-02 3.973e+03 10.346 <2e-16 ***
## year_med_z 2.255e-02 1.039e-02 3.972e+03 2.170 0.0300 *
## freq_z 1.159e-02 9.363e-03 3.968e+03 1.237 0.2160
## density_bin_z 9.694e-02 4.007e-02 8.230e+00 2.419 0.0411 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z -0.001
## year_med_z 0.001 -0.341
## freq_z 0.001 -0.010 -0.041
## densty_bn_z -0.318 0.033 -0.054 -0.088
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
## Data: df_vectorizer
## REML criterion at convergence: 5886.016
## Random effects:
## Groups Name Std.Dev. Corr
## field (Intercept) 0.7028
## density_bin_z 0.1165 -0.33
## Residual 0.4999
## Number of obs: 3981, groups: field, 9
## Fixed Effects:
## (Intercept) ref_med_z year_med_z freq_z density_bin_z
## 0.02354 0.14448 0.02255 0.01159 0.09694
getrefs("cpy_med", "Word2Vec", TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 3704.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9179 -0.5408 -0.0782 0.3886 5.4505
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.50971 0.7139
## density_bin_z 0.01459 0.1208 0.38
## Residual 0.22268 0.4719
## Number of obs: 2697, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.15774 0.23824 7.96916 -0.662 0.5266
## ref_med_z 0.11736 0.02164 2382.66994 5.423 6.47e-08 ***
## year_med_z 0.01173 0.01630 2551.20271 0.720 0.4718
## freq_z -0.01933 0.01129 2682.29647 -1.713 0.0868 .
## density_bin_z -0.03373 0.04454 10.16069 -0.757 0.4660
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.010
## year_med_z -0.001 -0.146
## freq_z -0.004 0.018 -0.053
## densty_bn_z 0.339 -0.210 -0.245 -0.004
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
## Data: df_vectorizer
## REML criterion at convergence: 3704.345
## Random effects:
## Groups Name Std.Dev. Corr
## field (Intercept) 0.7139
## density_bin_z 0.1208 0.38
## Residual 0.4719
## Number of obs: 2697, groups: field, 9
## Fixed Effects:
## (Intercept) ref_med_z year_med_z freq_z density_bin_z
## -0.15774 0.11736 0.01173 -0.01933 -0.03373
getrefs("cpy_med", "BOW", TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
## Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 3905.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.5459 -0.5489 -0.0859 0.4321 5.8348
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## field (Intercept) 0.5847 0.7646
## density_bin_z 0.0170 0.1304 0.70
## Residual 0.2675 0.5172
## Number of obs: 2519, groups: field, 8
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.21031 0.27111 6.95790 -0.776 0.46346
## ref_med_z 0.12214 0.02114 2291.08060 5.777 8.63e-09 ***
## year_med_z 0.01549 0.01459 1152.08085 1.062 0.28867
## freq_z -0.03822 0.01996 2469.16024 -1.915 0.05558 .
## density_bin_z -0.19559 0.05392 8.53926 -3.627 0.00602 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.017
## year_med_z -0.020 -0.194
## freq_z 0.005 -0.068 -0.089
## densty_bn_z 0.619 -0.096 -0.176 0.082
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
## Data: df_vectorizer
## REML criterion at convergence: 3905.925
## Random effects:
## Groups Name Std.Dev. Corr
## field (Intercept) 0.7646
## density_bin_z 0.1304 0.70
## Residual 0.5172
## Number of obs: 2519, groups: field, 8
## Fixed Effects:
## (Intercept) ref_med_z year_med_z freq_z density_bin_z
## -0.21031 0.12214 0.01549 -0.03822 -0.19559
# Create a new df grouped by vectorizer and field (should have done this above, refactor)
df_grouped <- df_all %>%
group_by(field, vectorizer) %>%
mutate_at(vars(cpy_med, density_bin, year_med, ref_med),
list(z = ~scale(.))) %>%
filter(cpy_med_z < 3,
cpy_med_z > -3,
density_bin_z < 3,
density_bin_z > -3,
ref_med_z < 3,
ref_med_z > -3,
# year_med_z < 3,
# year_med_z > -3
)
# head(df_grouped)
# drop gpt for now
for (vec in unique(df_grouped$vectorizer)) {
df_vec <- df_grouped %>% filter(
vectorizer == vec
)
plot <- (
ggplot(
df_vec,
aes(
# x=year_med_z,
# y=density_bin_z,
# What happens if we don't consider the z-score? Then GPT screws stuff up
x=year_med,
y=density_bin,
# color=field,
)
)
+ geom_smooth(
method="lm",
linewidth=3,
color="black",
)
+ geom_point(
aes(
color=field,
),
size=0.5,
alpha=0.2,
)
+ geom_smooth(
aes(
color=field,
),
# method="lm",
linewidth=1,
)
# + facet_grid("~ vectorizer")
# + facet_wrap("vectorizer")
+ theme_classic()
+ ggtitle(vec)
+ theme(
axis.text = element_text(size=20)
)
)
print(plot)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#
# ggsave(
# "rho_vs_year_by_vectorizers.png",
# width=12,
# height=3,
# )
# NOTE: need to loop over y var too!
for (y in c("cpy_med", "log_cpy_var")){
for (vec in unique(df_grouped$vectorizer)) {
df_vec <- df_all %>% filter(
vectorizer == vec
) %>% mutate(
# z-scale
y_z = scale(.data[[y]]),
density_bin_z = scale(density_bin),
year_med_z = scale(year_med),
ref_med_z = scale(ref_med),
freq_z = scale(freq),
) %>% filter(
y_z < 3,
y_z > -3,
density_bin_z < 3,
density_bin_z > -3,
ref_med_z < 3,
ref_med_z > -3
)
main_model <- lmer(
y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 | field),
data=df_vec,
control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
)
ablated_model <- lmer(
y_z ~ ref_med_z + year_med_z + freq_z + (1 | field),
data=df_vec,
control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
)
simpler_model <- lmer(
y_z ~ density_bin_z + (1 | field),
data=df_vec,
control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
)
print("--------------------")
print(paste(vec, "x", y))
print("--------------------")
# print(anova(
# main_model,
# ablated_model
# ))
# print("\n")
# print(summary(simpler_model))
print(summary(main_model))
}
}
## [1] "--------------------"
## [1] "SciBERT x cpy_med"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |
## field)
## Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 3179.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8824 -0.5529 -0.0661 0.4726 4.5826
##
## Random effects:
## Groups Name Variance Std.Dev.
## field (Intercept) 0.4221 0.6497
## Residual 0.2223 0.4715
## Number of obs: 2325, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -1.405e-01 2.171e-01 7.974e+00 -0.647 0.53568
## ref_med_z 1.048e-01 1.887e-02 2.319e+03 5.557 3.05e-08 ***
## year_med_z 4.183e-02 1.469e-02 2.319e+03 2.848 0.00444 **
## freq_z -1.843e-03 1.478e-02 2.303e+03 -0.125 0.90076
## density_bin_z 1.875e-01 1.426e-02 2.317e+03 13.147 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.019
## year_med_z -0.015 -0.166
## freq_z -0.035 0.027 0.116
## densty_bn_z 0.012 -0.199 -0.539 -0.237
## [1] "--------------------"
## [1] "SBERT x cpy_med"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |
## field)
## Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 2771.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0400 -0.5073 -0.0726 0.3905 6.2222
##
## Random effects:
## Groups Name Variance Std.Dev.
## field (Intercept) 0.4047 0.6361
## Residual 0.1927 0.4389
## Number of obs: 2263, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -1.786e-01 2.129e-01 8.062e+00 -0.839 0.426
## ref_med_z 2.100e-01 1.856e-02 2.258e+03 11.317 <2e-16 ***
## year_med_z 1.679e-02 1.481e-02 2.258e+03 1.134 0.257
## freq_z -1.699e-03 1.234e-02 2.229e+03 -0.138 0.890
## density_bin_z 1.667e-02 1.806e-02 2.257e+03 0.924 0.356
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.023
## year_med_z -0.023 -0.217
## freq_z -0.050 0.021 0.014
## densty_bn_z 0.015 -0.245 -0.570 -0.036
## [1] "--------------------"
## [1] "GPT2 x cpy_med"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |
## field)
## Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 6086.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0228 -0.5643 -0.1108 0.4398 5.8682
##
## Random effects:
## Groups Name Variance Std.Dev.
## field (Intercept) 0.4641 0.6812
## Residual 0.2608 0.5107
## Number of obs: 4017, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 3.505e-03 2.272e-01 7.986e+00 0.015 0.988
## ref_med_z 1.308e-01 1.394e-02 4.012e+03 9.387 <2e-16 ***
## year_med_z 1.189e-02 1.048e-02 4.010e+03 1.134 0.257
## freq_z 1.181e-03 8.980e-03 4.004e+03 0.132 0.895
## density_bin_z 1.295e-01 9.514e-03 4.007e+03 13.607 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z -0.001
## year_med_z 0.001 -0.348
## freq_z -0.001 -0.028 -0.043
## densty_bn_z -0.001 0.155 -0.195 -0.354
## [1] "--------------------"
## [1] "Word2Vec x cpy_med"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |
## field)
## Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 3805
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9918 -0.5621 -0.0729 0.3970 5.6923
##
## Random effects:
## Groups Name Variance Std.Dev.
## field (Intercept) 0.4590 0.6775
## Residual 0.2281 0.4776
## Number of obs: 2739, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.12863 0.22603 7.98175 -0.569 0.585
## ref_med_z 0.19219 0.01917 2733.85142 10.024 < 2e-16 ***
## year_med_z 0.00393 0.01512 2733.12028 0.260 0.795
## freq_z -0.02472 0.01044 2729.12118 -2.367 0.018 *
## density_bin_z -0.06946 0.01718 2730.13329 -4.043 5.43e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.008
## year_med_z 0.001 -0.180
## freq_z -0.006 0.006 -0.047
## densty_bn_z -0.004 -0.422 -0.565 -0.010
## [1] "--------------------"
## [1] "BOW x cpy_med"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |
## field)
## Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 3995.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7436 -0.5248 -0.0816 0.4096 6.3947
##
## Random effects:
## Groups Name Variance Std.Dev.
## field (Intercept) 0.5732 0.7571
## Residual 0.2630 0.5128
## Number of obs: 2611, groups: field, 8
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -1.762e-01 2.680e-01 6.976e+00 -0.658 0.5318
## ref_med_z 1.410e-01 2.044e-02 2.604e+03 6.899 6.56e-12 ***
## year_med_z 6.554e-03 1.341e-02 2.603e+03 0.489 0.6250
## freq_z -2.925e-02 1.327e-02 2.606e+03 -2.205 0.0275 *
## density_bin_z -1.712e-01 1.981e-02 2.606e+03 -8.642 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.017
## year_med_z -0.001 -0.208
## freq_z -0.015 -0.064 -0.082
## densty_bn_z -0.002 -0.242 -0.160 0.123
## [1] "--------------------"
## [1] "SciBERT x log_cpy_var"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |
## field)
## Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 4903.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9213 -0.6187 -0.0704 0.5328 4.9642
##
## Random effects:
## Groups Name Variance Std.Dev.
## field (Intercept) 0.1820 0.4266
## Residual 0.4675 0.6837
## Number of obs: 2331, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.01470 0.14379 7.88232 0.102 0.92113
## ref_med_z -0.04561 0.02733 2139.36161 -1.669 0.09534 .
## year_med_z -0.04086 0.02114 2291.48303 -1.933 0.05338 .
## freq_z -0.06035 0.02137 1619.40503 -2.825 0.00479 **
## density_bin_z -0.30218 0.02076 2321.15025 -14.557 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.041
## year_med_z -0.032 -0.167
## freq_z -0.076 0.039 0.107
## densty_bn_z 0.023 -0.207 -0.541 -0.215
## [1] "--------------------"
## [1] "SBERT x log_cpy_var"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |
## field)
## Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 4975.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.0497 -0.6652 -0.0509 0.5603 4.2597
##
## Random effects:
## Groups Name Variance Std.Dev.
## field (Intercept) 0.1743 0.4175
## Residual 0.5131 0.7163
## Number of obs: 2266, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 5.475e-02 1.425e-01 8.211e+00 0.384 0.7105
## ref_med_z 5.184e-02 3.000e-02 2.157e+03 1.728 0.0842 .
## year_med_z -1.764e-02 2.374e-02 2.086e+03 -0.743 0.4574
## freq_z 6.144e-03 1.981e-02 1.256e+03 0.310 0.7564
## density_bin_z -2.752e-01 2.938e-02 2.202e+03 -9.367 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.050
## year_med_z -0.049 -0.207
## freq_z -0.112 0.034 -0.005
## densty_bn_z 0.029 -0.265 -0.562 -0.016
## [1] "--------------------"
## [1] "GPT2 x log_cpy_var"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |
## field)
## Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 9524.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9929 -0.6787 -0.1161 0.5348 4.0942
##
## Random effects:
## Groups Name Variance Std.Dev.
## field (Intercept) 0.1470 0.3835
## Residual 0.6202 0.7875
## Number of obs: 4008, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.09058 0.12844 7.93687 -0.705 0.50086
## ref_med_z -0.03980 0.02146 3609.80797 -1.854 0.06375 .
## year_med_z -0.08804 0.01611 3951.11446 -5.466 4.90e-08 ***
## freq_z 0.05186 0.01378 3997.93074 3.764 0.00017 ***
## density_bin_z -0.08760 0.01471 4001.39249 -5.953 2.86e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z -0.002
## year_med_z 0.002 -0.352
## freq_z -0.003 -0.026 -0.044
## densty_bn_z -0.004 0.154 -0.192 -0.352
## [1] "--------------------"
## [1] "Word2Vec x log_cpy_var"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |
## field)
## Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 6374.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2903 -0.6572 -0.0841 0.5701 3.8504
##
## Random effects:
## Groups Name Variance Std.Dev.
## field (Intercept) 0.2574 0.5073
## Residual 0.5867 0.7660
## Number of obs: 2738, groups: field, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -7.979e-03 1.698e-01 7.959e+00 -0.047 0.9637
## ref_med_z -9.074e-03 3.054e-02 2.666e+03 -0.297 0.7664
## year_med_z -5.944e-03 2.413e-02 2.698e+03 -0.246 0.8054
## freq_z 1.871e-02 1.683e-02 2.733e+03 1.112 0.2663
## density_bin_z -8.373e-02 2.766e-02 2.731e+03 -3.027 0.0025 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.016
## year_med_z 0.001 -0.188
## freq_z -0.014 0.009 -0.046
## densty_bn_z -0.007 -0.419 -0.563 -0.010
## [1] "--------------------"
## [1] "BOW x log_cpy_var"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |
## field)
## Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 6044
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7600 -0.6873 -0.0779 0.5890 4.2408
##
## Random effects:
## Groups Name Variance Std.Dev.
## field (Intercept) 0.2609 0.5108
## Residual 0.5765 0.7593
## Number of obs: 2617, groups: field, 8
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.03425 0.18153 6.92650 0.189 0.8558
## ref_med_z 0.03151 0.02994 2371.48232 1.053 0.2926
## year_med_z 0.00572 0.01982 2605.87273 0.289 0.7730
## freq_z -0.03525 0.01962 2557.28915 -1.797 0.0724 .
## density_bin_z -0.20754 0.02900 2504.93731 -7.157 1.08e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) rf_md_ yr_md_ freq_z
## ref_med_z 0.036
## year_med_z -0.003 -0.215
## freq_z -0.034 -0.056 -0.081
## densty_bn_z -0.004 -0.245 -0.155 0.116